#include <cstdio>

/**
 * Debug kernel: each thread prints its global thread index, its block index,
 * and its index within the block.
 *
 * Launch layout: only the x dimension is used for indexing, so the printed
 * global id is meaningful for 1D grids of 1D blocks. No shared memory is
 * used and the kernel takes no arguments.
 *
 * Device-side printf output is buffered, so the host should synchronize with
 * the device after the launch before relying on the output having appeared.
 * The three lines of one thread are separate printf calls, so lines from
 * different threads may be interleaved in the output.
 */
__global__ void hello_cuda() {
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // All three values are unsigned int, so they are formatted with %u
    // (not %d) to match the argument type.
    printf("global thread id \t %u \n", idx);
    printf("block id \t %u \n", blockIdx.x);
    printf("local thread id \t %u \n", threadIdx.x);
}

/**
 * Launches hello_cuda on 3 blocks of 2 threads each and waits for the GPU to
 * finish.
 *
 * Returns 0 on success, or 1 if the launch or the kernel execution reported
 * a CUDA error (the error string is written to stderr).
 */
int main() {
    hello_cuda<<<3, 2>>>();

    // A kernel launch does not return a status itself; launch failures
    // (e.g. an invalid configuration) are reported via cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "hello_cuda launch failed: %s\n",
                cudaGetErrorString(status));
        return 1;
    }

    // Force the host to wait until the GPU has finished. Errors raised while
    // the kernel runs are reported by this call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "hello_cuda execution failed: %s\n",
                cudaGetErrorString(status));
        return 1;
    }

    return 0;
}
